#install.packages("softmaxreg")
#install.packages("DMwR")
#install.packages("esquisse")
#install.packages("inspectdf")
#install.packages("DataExplorer")
# Load packages
library(tidyverse)
library(ggplot2)
library(GGally)
library(stringr)
library(gridExtra)
library(scales)
library(ggthemes)
library(gghighlight)
library(janitor)
library(MultinomialCI)
library(htmlTable)
library(formattable)
library(DMwR)
library(esquisse)
library(inspectdf)
# Stop scientific notation: a large scipen penalty makes R prefer fixed
# notation when printing numbers for the rest of the session
options(scipen = 999)
Airbnb, Inc. is an online marketplace for arranging or offering lodging, primarily homestays, or tourism experiences. The company does not own any of the real estate listings, nor does it host events; it acts as a broker, receiving commissions from each booking. In this project, I explore the Seattle Airbnb market and provide recommendations to current Airbnb hosts and to people who want to host a home in the future.
I will analyze the Seattle Airbnb market data and determine the key findings, which are:
Be a superhost
Set a reasonable price
Make your Airbnb home instantly bookable
Host a home that accommodates a reasonable number of guests
Respond to your customers’ questions
Offer a flexible or moderate cancellation policy
Choose the right neighbourhood group
# Load data
# stringsAsFactors is set explicitly because the cleaning code that follows
# calls droplevels() and `levels<-` on text columns. Since R 4.0.0 read.csv()
# returns character columns by default, which would make those calls fail.
sea_airbnb <- read.csv("listings.csv", stringsAsFactors = TRUE)
# clean data
# look at the data: column types first, then per-column summaries
str(object = sea_airbnb)
summary(object = sea_airbnb)
# change column names: columns 9 and 10 are renamed in a single assignment
names(sea_airbnb)[9:10] <- c("neighbourhood", "neighbourhood_group")
# delete NAs and unused levels
# Keep only rows with no missing values and a reported response rate
sea_airbnb <- sea_airbnb[
  complete.cases(sea_airbnb) & sea_airbnb$host_response_rate != "N/A",
]
# Drop the factor levels that the row filter left without observations
sea_airbnb[c("host_response_time", "host_is_superhost", "host_response_rate")] <-
  lapply(
    sea_airbnb[c("host_response_time", "host_is_superhost", "host_response_rate")],
    droplevels
  )
# extract year only from host_since variable
# Parse the m/d/yy text as a Date and format the four-digit year directly.
# This avoids storing a POSIXlt object in a data frame column and slicing
# the year out of its implicit string conversion.
sea_airbnb$host_since <- as.numeric(
  format(
    as.Date(as.character(sea_airbnb$host_since), format = "%m/%d/%y"),
    "%Y"
  )
)
# convert factor variables to numerical variables
# "95%" -> 0.95
sea_airbnb$host_response_rate <-
  as.numeric(sub("%", "", sea_airbnb$host_response_rate, fixed = TRUE)) / 100
# Strip both the dollar sign and thousands separators. Removing only "$"
# leaves values such as "1,000.00", which as.numeric() turns into NA, so every
# listing priced at $1,000 or more was silently dropped by the filter below.
sea_airbnb$price <- as.numeric(gsub("[$,]", "", sea_airbnb$price))
# clean NA in price variable (rows whose price still cannot be parsed)
sea_airbnb <- sea_airbnb[complete.cases(sea_airbnb), ]
# look at the data after cleaning (first six rows)
head(sea_airbnb, n = 6L)
## id host_id host_name host_since host_response_time host_response_rate
## 1 2318 2536 Megan 2008 within an hour 1
## 3 6606 14942 Joyce 2009 within a few hours 1
## 4 9419 30559 Angielena 2009 within a few hours 1
## 5 9460 30832 Siena 2009 within an hour 1
## 6 9531 31481 Cassie 2009 within a few hours 1
## 7 9534 31481 Cassie 2009 within a few hours 1
## host_is_superhost host_total_listings_count neighbourhood
## 1 t 2 Madrona
## 3 f 5 Wallingford
## 4 t 8 Georgetown
## 5 t 4 First Hill
## 6 t 2 Fairmount Park
## 7 t 2 Fairmount Park
## neighbourhood_group zipcode latitude longitude property_type
## 1 Central Area 98122 47.61082 -122.2908 House
## 3 Other neighborhoods 98103 47.65411 -122.3376 Guesthouse
## 4 Other neighborhoods 98108 47.55062 -122.3201 Apartment
## 5 Downtown 98101 47.61265 -122.3294 Condominium
## 6 West Seattle 98136 47.55539 -122.3847 House
## 7 West Seattle 98136 47.55624 -122.3860 Guest suite
## room_type accommodates bathrooms bedrooms beds price
## 1 Entire home/apt 9 2.5 4 4 296
## 3 Entire home/apt 2 1.0 1 1 90
## 4 Private room 2 3.0 1 1 62
## 5 Private room 2 1.0 1 1 99
## 6 Entire home/apt 4 1.0 2 3 165
## 7 Entire home/apt 3 1.0 2 2 125
## number_of_reviews review_scores_rating instant_bookable
## 1 28 100 f
## 3 147 92 f
## 4 144 93 f
## 5 443 98 t
## 6 37 100 f
## 7 44 100 f
## cancellation_policy reviews_per_month
## 1 strict_14_with_grace_period 0.21
## 3 strict_14_with_grace_period 1.19
## 4 moderate 1.29
## 5 moderate 3.62
## 6 strict_14_with_grace_period 0.39
## 7 strict_14_with_grace_period 0.47
Data appears tidy and ready for analysis
# add weighted review score column
# Alternative year weighting, left disabled:
#sea_airbnb$year_weight <- SoftMax(max(sea_airbnb$host_since)-sea_airbnb$host_since)
# NOTE(review): SoftMax() is applied to the raw start year, so more recent
# hosts presumably receive the larger weight - confirm this is intended.
sea_airbnb[["year_weight"]] <- SoftMax(sea_airbnb[["host_since"]])
sea_airbnb[["review_weight"]] <- log(sea_airbnb[["number_of_reviews"]])
# Score = rating x year weight x log(review count)
sea_airbnb[["weighted_score"]] <- with(
  sea_airbnb,
  review_scores_rating * year_weight * review_weight
)
# convert numeric weighted review score data to categorical performance
# cut() replaces five hand-written masks with one call on the same intervals:
# [0, 20] Bad, (20, 120] Poor, (120, 300] Fair, (300, 400] Good,
# (400, Inf] Excellent. Scores outside these ranges (negative or missing)
# stay NA, as before.
sea_airbnb$performance <- cut(
  sea_airbnb$weighted_score,
  breaks = c(0, 20, 120, 300, 400, Inf),
  labels = c("Bad", "Poor", "Fair", "Good", "Excellent"),
  right = TRUE,
  include.lowest = TRUE
)
# change level names for variables
# Map the raw "f"/"t" codes to labels explicitly. Overwriting levels() by
# position depends on the level order and fails if an extra (e.g. empty)
# level is present.
sea_airbnb$host_is_superhost <- factor(
  sea_airbnb$host_is_superhost,
  levels = c("f", "t"),
  labels = c("No", "Yes")
)
sea_airbnb$instant_bookable <- factor(
  sea_airbnb$instant_bookable,
  levels = c("f", "t"),
  labels = c("No", "Yes")
)
# adjusted columns for analysis use: the two intermediate weights are dropped
sea_airbnb <- sea_airbnb %>%
  select(-c(year_weight, review_weight))
# look at the data
summary(object = sea_airbnb)
## id host_id host_name
## Min. : 2318 Min. : 862 Corp Condos & Apts: 233
## 1st Qu.:12908346 1st Qu.: 7911180 Day 1 : 114
## Median :21357272 Median : 26967583 Addison : 92
## Mean :20375502 Mean : 57739200 Loftium : 84
## 3rd Qu.:28635671 3rd Qu.: 82961680 Dario : 82
## Max. :38649181 Max. :293180955 Melissa : 73
## (Other) :5623
## host_since host_response_time host_response_rate
## Min. :2008 a few days or more: 19 Min. :0.0000
## 1st Qu.:2013 within a day : 248 1st Qu.:1.0000
## Median :2015 within a few hours: 687 Median :1.0000
## Mean :2015 within an hour :5347 Mean :0.9829
## 3rd Qu.:2016 3rd Qu.:1.0000
## Max. :2019 Max. :1.0000
##
## host_is_superhost host_total_listings_count neighbourhood
## No :2994 Min. : 0.00 Broadway : 372
## Yes:3307 1st Qu.: 1.00 Belltown : 358
## Median : 2.00 Wallingford: 242
## Mean : 83.92 First Hill : 238
## 3rd Qu.: 9.00 Minor : 235
## Max. :1795.00 Fremont : 202
## (Other) :4654
## neighbourhood_group zipcode latitude
## Other neighborhoods:1195 98122 : 651 Min. :47.50
## Downtown :1091 98103 : 591 1st Qu.:47.60
## Capitol Hill : 639 98101 : 471 Median :47.62
## Central Area : 574 98144 : 342 Mean :47.62
## Queen Anne : 449 98121 : 331 3rd Qu.:47.66
## West Seattle : 362 98109 : 328 Max. :47.74
## (Other) :1991 (Other):3587
## longitude property_type room_type
## Min. :-122.4 Apartment :1985 Entire home/apt:4833
## 1st Qu.:-122.4 House :1953 Hotel room : 111
## Median :-122.3 Guest suite: 802 Private room :1276
## Mean :-122.3 Townhouse : 490 Shared room : 81
## 3rd Qu.:-122.3 Condominium: 384
## Max. :-122.2 Guesthouse : 223
## (Other) : 464
## accommodates bathrooms bedrooms beds
## Min. : 1.000 Min. : 0.000 Min. :0.000 Min. : 0.000
## 1st Qu.: 2.000 1st Qu.: 1.000 1st Qu.:1.000 1st Qu.: 1.000
## Median : 3.000 Median : 1.000 Median :1.000 Median : 1.000
## Mean : 3.771 Mean : 1.299 Mean :1.372 Mean : 1.944
## 3rd Qu.: 4.000 3rd Qu.: 1.000 3rd Qu.:2.000 3rd Qu.: 2.000
## Max. :28.000 Max. :16.000 Max. :8.000 Max. :49.000
##
## price number_of_reviews review_scores_rating instant_bookable
## Min. : 0.0 Min. : 1.00 Min. : 20.00 No :2876
## 1st Qu.: 80.0 1st Qu.: 10.00 1st Qu.: 94.00 Yes:3425
## Median :119.0 Median : 35.00 Median : 97.00
## Mean :156.8 Mean : 65.17 Mean : 95.15
## 3rd Qu.:180.0 3rd Qu.: 90.00 3rd Qu.: 99.00
## Max. :999.0 Max. :767.00 Max. :100.00
##
## cancellation_policy reviews_per_month weighted_score
## flexible :1093 Min. : 0.010 Min. : 0.00
## moderate :2477 1st Qu.: 0.840 1st Qu.: 15.71
## strict : 267 Median : 2.200 Median :124.77
## strict_14_with_grace_period:2398 Mean : 2.771 Mean :156.44
## super_strict_30 : 51 3rd Qu.: 4.160 3rd Qu.:280.45
## super_strict_60 : 15 Max. :14.870 Max. :562.52
##
## performance
## Bad :1645
## Poor :1442
## Fair :1869
## Good : 940
## Excellent: 405
##
##
New variables explanation:
Data observations
Questions on data
# Examine data structure after cleaning (column types and leading values)
str(object = sea_airbnb)
## 'data.frame': 6301 obs. of 27 variables:
## $ id : int 2318 6606 9419 9460 9531 9534 9596 9909 11012 14386 ...
## $ host_id : int 2536 14942 30559 30832 31481 31481 14942 33360 14942 39377 ...
## $ host_name : Factor w/ 2449 levels "","'Keia","(Email hidden by Airbnb)",..: 1583 1109 140 2081 400 400 1109 1325 1109 286 ...
## $ host_since : num 2008 2009 2009 2009 2009 ...
## $ host_response_time : Factor w/ 4 levels "a few days or more",..: 4 3 3 4 3 3 3 4 3 2 ...
## $ host_response_rate : num 1 1 1 1 1 1 1 1 1 1 ...
## $ host_is_superhost : Factor w/ 2 levels "No","Yes": 2 1 2 2 2 2 1 2 1 1 ...
## $ host_total_listings_count: int 2 5 8 4 2 2 5 8 5 3 ...
## $ neighbourhood : Factor w/ 89 levels "Adams","Alki",..: 43 82 25 21 19 19 82 19 82 26 ...
## $ neighbourhood_group : Factor w/ 17 levels "Ballard","Beacon Hill",..: 5 12 12 7 17 17 12 17 12 12 ...
## $ zipcode : Factor w/ 34 levels ""," ","90105",..: 22 7 13 5 27 27 7 24 7 7 ...
## $ latitude : num 47.6 47.7 47.6 47.6 47.6 ...
## $ longitude : num -122 -122 -122 -122 -122 ...
## $ property_type : Factor w/ 29 levels "Apartment","Bed and breakfast",..: 18 15 1 10 18 14 1 18 18 18 ...
## $ room_type : Factor w/ 4 levels "Entire home/apt",..: 1 1 3 3 1 1 1 1 1 3 ...
## $ accommodates : int 9 2 2 2 4 3 4 8 8 2 ...
## $ bathrooms : num 2.5 1 3 1 1 1 1 2 2 1.5 ...
## $ bedrooms : int 4 1 1 1 2 2 1 3 3 1 ...
## $ beds : int 4 1 1 1 3 2 4 5 3 1 ...
## $ price : num 296 90 62 99 165 125 120 125 299 40 ...
## $ number_of_reviews : int 28 147 144 443 37 44 91 71 91 141 ...
## $ review_scores_rating : int 100 92 93 98 100 100 91 96 96 92 ...
## $ instant_bookable : Factor w/ 2 levels "No","Yes": 1 1 1 2 1 1 1 2 1 1 ...
## $ cancellation_policy : Factor w/ 6 levels "flexible","moderate",..: 4 4 2 2 4 4 4 4 4 4 ...
## $ reviews_per_month : num 0.21 1.19 1.29 3.62 0.39 0.47 0.9 0.59 0.78 1.27 ...
## $ weighted_score : num 0.0299 0.1707 0.1718 0.222 0.1342 ...
## $ performance : Factor w/ 5 levels "Bad","Poor","Fair",..: 1 1 1 1 1 1 1 1 1 1 ...
Now, I will explore some important variable individually.
I selected host_response_time, host_is_superhost, neighbourhood, neighbourhood_group, zipcode, property_type, room_type, instant_bookable, cancellation_policy, and performance to examine individually.
# Listing counts by host response time (top) and superhost status (bottom)
grid.arrange(
  ggplot(sea_airbnb, aes(x = host_response_time)) + geom_bar(),
  ggplot(sea_airbnb, aes(x = host_is_superhost)) + geom_bar(),
  ncol = 1
)
Comments
- Most hosts respond within an hour.
- There are slightly more superhosts than non-superhosts.
# Listing counts per neighbourhood (top) and per neighbourhood group (bottom);
# axis labels are shrunk and rotated so they fit
grid.arrange(
  ggplot(sea_airbnb, aes(x = neighbourhood)) +
    geom_bar() +
    theme(axis.text.x = element_text(angle = 90, size = 5)),
  ggplot(sea_airbnb, aes(x = neighbourhood_group)) +
    geom_bar() +
    theme(axis.text.x = element_text(angle = 20, size = 8)),
  ncol = 1
)
Comments
- By neighbourhood, Belltown and Broadway have the largest number of Airbnbs.
- For grouped neighbourhood, Downtown has the most number of Airbnbs (besides Other neighbourhoods).
# Listing counts per zipcode
sea_airbnb %>%
  ggplot(aes(x = zipcode)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 30, size = 5))
Comments
- 98101, 98103, and 98122 have the most number of Airbnbs
# Listing counts by property type (top) and room type (bottom)
grid.arrange(
  ggplot(sea_airbnb, aes(x = property_type)) +
    geom_bar() +
    theme(axis.text.x = element_text(angle = 30, size = 5)),
  ggplot(sea_airbnb, aes(x = room_type)) + geom_bar(),
  ncol = 1
)
Comments
- Most of the Airbnb property types in Seattle are apartment and house.
- Most of the Airbnb room type in Seattle is entire home/apt.
# Listing counts by instant-bookable flag (top) and cancellation policy (bottom)
grid.arrange(
  ggplot(sea_airbnb, aes(x = instant_bookable)) + geom_bar(),
  ggplot(sea_airbnb, aes(x = cancellation_policy)) +
    geom_bar() +
    theme(axis.text.x = element_text(angle = 10, size = 10)),
  ncol = 1
)
Comments
- There are more instant-bookable Airbnbs than non-instant-bookable ones.
- Most of the Airbnbs have flexible, moderate or strict 14 with grace period cancellation policy.
# Listing counts per performance tier
sea_airbnb %>%
  ggplot(aes(x = performance)) +
  geom_bar()
Comments
Performance variable is an overall ranking based on review numbers, host years, and review scores (i.e: weighted score).
We can see that nearly half of the Airbnbs in Seattle do not perform very well.
I will examine on:
- host_total_listings_count
- host_response_rate
- accommodates
- bedrooms
- price
- number_of_reviews
- reviews_per_month
- weighted_score
# host_total_listings_count: histogram over a horizontal boxplot
grid.arrange(
  ggplot(sea_airbnb, aes(x = host_total_listings_count)) + geom_histogram(),
  ggplot(sea_airbnb, aes(x = 1, y = host_total_listings_count)) +
    geom_boxplot() +
    coord_flip(),
  ncol = 1
)
Comments
# host_response_rate: histogram over a horizontal boxplot
grid.arrange(
  ggplot(sea_airbnb, aes(x = host_response_rate)) + geom_histogram(),
  ggplot(sea_airbnb, aes(x = 1, y = host_response_rate)) +
    geom_boxplot() +
    coord_flip(),
  ncol = 1
)
Comments
The distribution of host_response_rate is skewed
# accommodates: count per value over a horizontal boxplot
grid.arrange(
  ggplot(sea_airbnb, aes(x = accommodates)) + geom_bar(),
  ggplot(sea_airbnb, aes(x = 1, y = accommodates)) +
    geom_boxplot() +
    coord_flip(),
  ncol = 1
)
Comments
Confirm the accommodates is a skewed distribution
# bedrooms: count per value over a horizontal boxplot
grid.arrange(
  ggplot(sea_airbnb, aes(x = bedrooms)) + geom_bar(),
  ggplot(sea_airbnb, aes(x = 1, y = bedrooms)) +
    geom_boxplot() +
    coord_flip(),
  ncol = 1
)
Comments
Confirms the bedrooms variable is a skewed distribution
# price: count per distinct price over a horizontal boxplot
grid.arrange(
  ggplot(sea_airbnb, aes(x = price)) + geom_bar(),
  ggplot(sea_airbnb, aes(x = 1, y = price)) +
    geom_boxplot() +
    coord_flip(),
  ncol = 1
)
Comments
The distribution is asymmetric
# number_of_reviews: count per value over a horizontal boxplot
grid.arrange(
  ggplot(sea_airbnb, aes(x = number_of_reviews)) + geom_bar(),
  ggplot(sea_airbnb, aes(x = 1, y = number_of_reviews)) +
    geom_boxplot() +
    coord_flip(),
  ncol = 1
)
# reviews_per_month: histogram over a horizontal boxplot.
# The variable is a continuous rate, so it is binned with geom_histogram(),
# as is done for host_response_rate. geom_bar() draws one hairline bar per
# distinct value, which hides the shape of the distribution.
grid.arrange(
  ggplot(sea_airbnb, aes(x = reviews_per_month)) +
    geom_histogram(bins = 30),
  ggplot(sea_airbnb, aes(x = 1, y = reviews_per_month)) +
    geom_boxplot() +
    coord_flip(),
  ncol = 1
)
Comments
The distribution is asymmetric
# review_scores_rating: count per score over a horizontal boxplot
grid.arrange(
  ggplot(sea_airbnb, aes(x = review_scores_rating)) + geom_bar(),
  ggplot(sea_airbnb, aes(x = 1, y = review_scores_rating)) +
    geom_boxplot() +
    coord_flip(),
  ncol = 1
)
Comments
Confirms the review scores rating is skewed
Questions
Do these scores objectively reflect the rating based on different number of reviews?
# Correlation table
# Pairwise correlations of the numeric columns, leaving out identifiers, the
# host start year and the coordinates; rounded to two decimals
round(
  cor(
    sea_airbnb %>%
      select_if(is.numeric) %>%
      select(-id, -host_id, -host_since, -latitude, -longitude)
  ),
  digits = 2
)
## host_response_rate host_total_listings_count
## host_response_rate 1.00 0.01
## host_total_listings_count 0.01 1.00
## accommodates 0.01 -0.02
## bathrooms 0.01 -0.01
## bedrooms 0.00 -0.05
## beds 0.01 -0.06
## price 0.01 0.47
## number_of_reviews 0.07 -0.19
## review_scores_rating 0.09 -0.23
## reviews_per_month 0.11 -0.23
## weighted_score 0.08 -0.23
## accommodates bathrooms bedrooms beds price
## host_response_rate 0.01 0.01 0.00 0.01 0.01
## host_total_listings_count -0.02 -0.01 -0.05 -0.06 0.47
## accommodates 1.00 0.52 0.80 0.82 0.46
## bathrooms 0.52 1.00 0.58 0.47 0.32
## bedrooms 0.80 0.58 1.00 0.71 0.40
## beds 0.82 0.47 0.71 1.00 0.37
## price 0.46 0.32 0.40 0.37 1.00
## number_of_reviews -0.07 -0.10 -0.12 -0.07 -0.17
## review_scores_rating 0.03 0.02 0.07 0.03 -0.09
## reviews_per_month -0.02 -0.14 -0.15 -0.05 -0.21
## weighted_score 0.04 -0.03 -0.01 0.02 -0.13
## number_of_reviews review_scores_rating
## host_response_rate 0.07 0.09
## host_total_listings_count -0.19 -0.23
## accommodates -0.07 0.03
## bathrooms -0.10 0.02
## bedrooms -0.12 0.07
## beds -0.07 0.03
## price -0.17 -0.09
## number_of_reviews 1.00 0.14
## review_scores_rating 0.14 1.00
## reviews_per_month 0.57 0.15
## weighted_score 0.21 0.15
## reviews_per_month weighted_score
## host_response_rate 0.11 0.08
## host_total_listings_count -0.23 -0.23
## accommodates -0.02 0.04
## bathrooms -0.14 -0.03
## bedrooms -0.15 -0.01
## beds -0.05 0.02
## price -0.21 -0.13
## number_of_reviews 0.57 0.21
## review_scores_rating 0.15 0.15
## reviews_per_month 1.00 0.44
## weighted_score 0.44 1.00
Comments
number of reviews has some negative correlations with price.
weighted_score has some negative correlations with price.
Questions
For categorical data, I’ll take a look at the relationships across different categorical variables:
# Response time by superhost status, as column percentages with totals
tabyl(sea_airbnb, host_response_time, host_is_superhost) %>%
  adorn_totals(c("row", "col")) %>%
  adorn_percentages("col") %>%
  adorn_rounding(digits = 2)
## host_response_time No Yes Total
## a few days or more 0.01 0.00 0.00
## within a day 0.06 0.02 0.04
## within a few hours 0.11 0.11 0.11
## within an hour 0.83 0.86 0.85
## Total 1.00 1.00 1.00
Comments
Superhosts and non-superhosts have similar percentage distributions of host response time.
Indicates we can add one more quantitative variables (i.e: price) to take a further look
# Superhost status by neighbourhood group, as shares of all listings
tabyl(sea_airbnb, host_is_superhost, neighbourhood_group) %>%
  adorn_totals(c("row", "col")) %>%
  adorn_percentages("all") %>%
  adorn_rounding(digits = 2)
## host_is_superhost Ballard Beacon Hill Capitol Hill Cascade Central Area
## No 0.02 0.02 0.05 0.02 0.03
## Yes 0.03 0.02 0.05 0.02 0.06
## Total 0.05 0.04 0.10 0.04 0.09
## Delridge Downtown Interbay Lake City Magnolia Northgate
## 0.01 0.12 0 0.01 0.01 0.02
## 0.02 0.05 0 0.01 0.01 0.01
## 0.03 0.17 0 0.02 0.02 0.03
## Other neighborhoods Queen Anne Rainier Valley Seward Park
## 0.08 0.03 0.02 0.00
## 0.11 0.04 0.03 0.01
## 0.19 0.07 0.05 0.01
## University District West Seattle Total
## 0.02 0.02 0.48
## 0.01 0.03 0.52
## 0.03 0.06 1.00
Comments
# Instant-bookable flag by superhost status, as shares of all listings
tabyl(sea_airbnb, instant_bookable, host_is_superhost) %>%
  adorn_totals(c("row", "col")) %>%
  adorn_percentages("all") %>%
  adorn_rounding(digits = 2)
## instant_bookable No Yes Total
## No 0.19 0.27 0.46
## Yes 0.29 0.26 0.54
## Total 0.48 0.52 1.00
Comments
# Superhost status by cancellation policy, as shares of all listings
tabyl(sea_airbnb, host_is_superhost, cancellation_policy) %>%
  adorn_totals(c("row", "col")) %>%
  adorn_percentages("all") %>%
  adorn_rounding(digits = 3)
## host_is_superhost flexible moderate strict strict_14_with_grace_period
## No 0.072 0.147 0.037 0.210
## Yes 0.102 0.246 0.005 0.171
## Total 0.173 0.393 0.042 0.381
## super_strict_30 super_strict_60 Total
## 0.008 0.002 0.475
## 0.000 0.000 0.525
## 0.008 0.002 1.000
Comments
Most superhosts do not have a very strict cancellation policy.
Need to combine quantitative variables to take a look
# Neighbourhood group by room type, as shares of all listings
tabyl(sea_airbnb, neighbourhood_group, room_type) %>%
  adorn_totals(c("row", "col")) %>%
  adorn_percentages("all") %>%
  adorn_rounding(digits = 3)
## neighbourhood_group Entire home/apt Hotel room Private room Shared room
## Ballard 0.044 0.000 0.009 0.000
## Beacon Hill 0.025 0.000 0.013 0.000
## Capitol Hill 0.077 0.001 0.019 0.004
## Cascade 0.023 0.011 0.003 0.000
## Central Area 0.069 0.000 0.022 0.000
## Delridge 0.019 0.000 0.013 0.000
## Downtown 0.162 0.006 0.004 0.001
## Interbay 0.002 0.000 0.000 0.000
## Lake City 0.011 0.000 0.007 0.000
## Magnolia 0.014 0.000 0.004 0.000
## Northgate 0.015 0.000 0.011 0.000
## Other neighborhoods 0.135 0.000 0.051 0.004
## Queen Anne 0.059 0.000 0.012 0.000
## Rainier Valley 0.037 0.000 0.017 0.000
## Seward Park 0.009 0.000 0.003 0.000
## University District 0.018 0.000 0.005 0.003
## West Seattle 0.049 0.000 0.009 0.000
## Total 0.767 0.018 0.203 0.013
## Total
## 0.053
## 0.037
## 0.101
## 0.037
## 0.091
## 0.033
## 0.173
## 0.002
## 0.018
## 0.018
## 0.025
## 0.190
## 0.071
## 0.055
## 0.012
## 0.027
## 0.057
## 1.000
# Neighbourhood group by instant-bookable flag, as shares of all listings
tabyl(sea_airbnb, neighbourhood_group, instant_bookable) %>%
  adorn_totals(c("row", "col")) %>%
  adorn_percentages("all") %>%
  adorn_rounding(digits = 3)
## neighbourhood_group No Yes Total
## Ballard 0.027 0.026 0.053
## Beacon Hill 0.017 0.020 0.037
## Capitol Hill 0.046 0.055 0.101
## Cascade 0.011 0.026 0.037
## Central Area 0.043 0.049 0.091
## Delridge 0.017 0.015 0.033
## Downtown 0.043 0.130 0.173
## Interbay 0.001 0.001 0.002
## Lake City 0.011 0.008 0.018
## Magnolia 0.010 0.008 0.018
## Northgate 0.013 0.012 0.025
## Other neighborhoods 0.106 0.084 0.190
## Queen Anne 0.032 0.039 0.071
## Rainier Valley 0.028 0.026 0.055
## Seward Park 0.007 0.004 0.012
## University District 0.015 0.011 0.027
## West Seattle 0.030 0.028 0.057
## Total 0.456 0.544 1.000
Comments
Questions
# Cancellation policy by instant-bookable flag, as shares of all listings
tabyl(sea_airbnb, cancellation_policy, instant_bookable) %>%
  adorn_totals(c("row", "col")) %>%
  adorn_percentages("all") %>%
  adorn_rounding(digits = 3)
## cancellation_policy No Yes Total
## flexible 0.070 0.103 0.173
## moderate 0.197 0.196 0.393
## strict 0.005 0.037 0.042
## strict_14_with_grace_period 0.184 0.197 0.381
## super_strict_30 0.000 0.008 0.008
## super_strict_60 0.000 0.002 0.002
## Total 0.456 0.544 1.000
# Performance tier by superhost status: raw counts with row and column totals
tabyl(sea_airbnb, performance, host_is_superhost) %>%
  adorn_totals(c("row", "col"))
## performance No Yes Total
## Bad 885 760 1645
## Poor 791 651 1442
## Fair 962 907 1869
## Good 287 653 940
## Excellent 69 336 405
## Total 2994 3307 6301
Comments
There is no apparent difference for bad, poor and fair group across by host_is_superhost.
But superhost performs much better in good and excellent group
Questions
# Instant-bookable flag by performance tier: raw counts with totals
tabyl(sea_airbnb, instant_bookable, performance) %>%
  adorn_totals(c("row", "col"))
## instant_bookable Bad Poor Fair Good Excellent Total
## No 852 709 824 349 142 2876
## Yes 793 733 1045 591 263 3425
## Total 1645 1442 1869 940 405 6301
Comments
Questions
# host_response_time across host_is_superhost (side-by-side bars)
ggplot(sea_airbnb, aes(x = host_response_time, fill = host_is_superhost)) +
  geom_bar(position = "dodge")
# instant_bookable across host_is_superhost (side-by-side bars)
ggplot(sea_airbnb, aes(x = instant_bookable, fill = host_is_superhost)) +
  geom_bar(position = "dodge")
Comments
# cancellation_policy across host_is_superhost (side-by-side bars)
ggplot(sea_airbnb, aes(x = cancellation_policy, fill = host_is_superhost)) +
  geom_bar(position = "dodge") +
  theme(axis.text.x = element_text(angle = 15, size = 10))
Comments
Questions
Will cancellation policies affect the price?
Will cancellation policies affect the weighted_score?
# neighbourhood_group across room_type
ggplot(sea_airbnb, aes(x = neighbourhood_group, fill = room_type)) +
  geom_bar(position = "dodge") +
  theme(axis.text.x = element_text(angle = 15, size = 5))
# neighbourhood_group across instant_bookable
ggplot(sea_airbnb, aes(x = neighbourhood_group, fill = instant_bookable)) +
  geom_bar(position = "dodge") +
  theme(axis.text.x = element_text(angle = 15, size = 5))
# neighbourhood_group across cancellation_policy
ggplot(sea_airbnb, aes(x = neighbourhood_group, fill = cancellation_policy)) +
  geom_bar(position = "dodge") +
  theme(axis.text.x = element_text(angle = 15, size = 5))
Comments
Questions
# cancellation_policy across instant_bookable (side-by-side bars)
ggplot(sea_airbnb, aes(x = cancellation_policy, fill = instant_bookable)) +
  geom_bar(position = "dodge") +
  theme(axis.text.x = element_text(angle = 15, size = 10))
Comments
# performance across host_is_superhost (side-by-side bars)
ggplot(sea_airbnb, aes(x = performance, fill = host_is_superhost)) +
  geom_bar(position = "dodge")
Comments
# performance across property_type (side-by-side bars per property type)
ggplot(sea_airbnb, aes(x = property_type, fill = performance)) +
  geom_bar(position = "dodge") +
  theme(axis.text.x = element_text(angle = 15, size = 5))
Comments
appears to be a difference in performance across property_type
need to take a closer look
Comments
# performance across instant_bookable (side-by-side bars)
ggplot(sea_airbnb, aes(x = performance, fill = instant_bookable)) +
  geom_bar(position = "dodge")
Comments
# performance across cancellation_policy (side-by-side bars per policy)
ggplot(sea_airbnb, aes(x = cancellation_policy, fill = performance)) +
  geom_bar(position = "dodge") +
  theme(axis.text.x = element_text(angle = 15, size = 10))
Comments
Listings with flexible or moderate cancellation policies are more likely to achieve a better performance.
# more detail on performance and property_type
# Top: counts per tier (dodged); bottom: tier shares within each property
# type (filled). Both are flipped so property types read down the axis.
grid.arrange(
  ggplot(sea_airbnb, aes(x = property_type, fill = performance)) +
    geom_bar(position = "dodge") +
    coord_flip() +
    theme(axis.text.y = element_text(size = 5)),
  ggplot(sea_airbnb, aes(x = property_type, fill = performance)) +
    geom_bar(position = "fill") +
    coord_flip() +
    theme(axis.text.y = element_text(size = 5)),
  ncol = 1
)
Comments
Farm stay and Boutique hotel have the best performance overall.
Townhouse, Tinyhouse, Loft, House, Guesthouse, Guest suite, Cottage, Condominium, Boat, and Apartment have similar performance distributions.
# more detail on performance and cancellation policy
# Top: counts per tier (dodged); bottom: tier shares within each policy (filled)
grid.arrange(
  ggplot(sea_airbnb, aes(x = cancellation_policy, fill = performance)) +
    geom_bar(position = "dodge") +
    coord_flip(),
  ggplot(sea_airbnb, aes(x = cancellation_policy, fill = performance)) +
    geom_bar(position = "fill") +
    coord_flip(),
  ncol = 1
)
Comments
In this part, I will examine the relationship between different quantitative variables:
- host_response_rate
- accommodates
- bedrooms
- reviews_per_month
- number_of_reviews
- price
- weighted_score
# Distribution of numeric variables
# correlation between price and other variables
show_plot(inspect_cor(sea_airbnb, with_col = "price"))
Comments
Price has apparent positive correlation with host_total_listings_count, accommodates, bedrooms, beds, and bathrooms.
Price has apparent negative correlation with reviews_per_month, number_of_reviews, weighted_score, and review_scores_rating.
Questions
# correlation between weighted score and other variables
show_plot(inspect_cor(sea_airbnb, with_col = "weighted_score"))
Comments
weighted_score has apparent positive correlation with host_since, reviews_per_month, number_of_reviews, and review_scores_rating.
weighted_score has apparent negative correlation with host_total_listings_count, price, and minimum_nights.
# host_response_rate across weighted score (scatter plot)
ggplot(sea_airbnb, aes(x = host_response_rate, y = weighted_score)) +
  geom_point()
Comments
Some high response rates have low weighted scores.
Didn’t find apparent linear relationship between weighted_score and host_response_rate
# host_response_rate across price (scatter plot)
ggplot(sea_airbnb, aes(x = host_response_rate, y = price)) +
  geom_point()
Comments
Most of the points are clustered on the price lower than $250 and response rate is higher than 0.875.
People may prefer a lower price Airbnb.
# price across accommodates
# group = accommodates draws one violin per guest capacity. With a numeric x
# and no grouping, geom_violin() pools every listing into a single violin.
sea_airbnb %>%
  ggplot(mapping = aes(x = accommodates, y = price, group = accommodates)) +
  geom_violin(alpha = 0.5, color = "gray") +
  geom_jitter(
    aes(color = accommodates),
    alpha = 0.5,
    position = position_jitter(width = 0.1)
  )
Comments
# price across bedrooms
# group = bedrooms draws one violin per bedroom count. With a numeric x and
# no grouping, geom_violin() pools every listing into a single violin.
sea_airbnb %>%
  ggplot(mapping = aes(x = bedrooms, y = price, group = bedrooms)) +
  geom_violin(alpha = 0.5, color = "gray") +
  geom_jitter(
    aes(color = bedrooms),
    alpha = 0.5,
    position = position_jitter(width = 0.1)
  )
Comments
# host_total_listings_count across price (scatter plot)
ggplot(sea_airbnb, aes(x = host_total_listings_count, y = price)) +
  geom_point()
Comments
Most of the data points are clustered below $500 and 250 total listing counts.
There is no apparent positive relationship between total listings and price.
# reviews_per_month across price (scatter plot)
ggplot(sea_airbnb, aes(x = reviews_per_month, y = price)) +
  geom_point()
Comments
The highest numbers of reviews per month appear at prices below $250.
# price across weighted_score (scatter plot)
ggplot(sea_airbnb, aes(x = weighted_score, y = price)) +
  geom_point()
Comments
Nearly all high weighted scores are clustered at prices below $250.
Only a few high-priced Airbnbs have a high weighted score.
# number_of_reviews across weighted_score (scatter plot)
ggplot(sea_airbnb, aes(x = number_of_reviews, y = weighted_score)) +
  geom_point()
Comments
# reviews_per_month across accommodates
# Laid out like the price violins: the discrete capacity goes on x with
# group = accommodates, and reviews_per_month is the distribution on y. The
# earlier mapping (continuous x, capacity on y, no grouping) produced a single
# violin of the accommodates values instead of one distribution per capacity.
sea_airbnb %>%
  ggplot(
    mapping = aes(x = accommodates, y = reviews_per_month, group = accommodates)
  ) +
  geom_violin(alpha = 0.5, color = "gray") +
  geom_jitter(
    aes(color = accommodates),
    alpha = 0.5,
    position = position_jitter(width = 0.1)
  )
Comments
# reviews_per_month across bedrooms
# Laid out like the price violins: the bedroom count goes on x with
# group = bedrooms, and reviews_per_month is the distribution on y. The
# earlier mapping (continuous x, bedrooms on y, no grouping) produced a single
# violin of the bedroom values instead of one distribution per bedroom count.
sea_airbnb %>%
  ggplot(
    mapping = aes(x = bedrooms, y = reviews_per_month, group = bedrooms)
  ) +
  geom_violin(alpha = 0.5, color = "gray") +
  geom_jitter(
    aes(color = bedrooms),
    alpha = 0.5,
    position = position_jitter(width = 0.1)
  )
Comments
# Pairwise plot matrix of selected variables.
# Columns are chosen by name instead of position (5, 7, 15, 16, 20, 24, 25, 27)
# so the selection survives columns being added, dropped or reordered.
# NOTE(review): instant_bookable (column 23) is not in this selection although
# the comments after the plot discuss it - confirm whether it should be added.
sea_airbnb %>%
  ggpairs(columns = c(
    "host_response_time", "host_is_superhost", "room_type", "accommodates",
    "price", "cancellation_policy", "reviews_per_month", "performance"
  ))
Comments
host_is_superhost balanced looking both directions on correlation graph.
instant_bookable balanced looking both directions on correlation graph.
Questions
Look more closely at:
# Price distribution by superhost status (top) and neighbourhood group (bottom)
grid.arrange(
  ggplot(sea_airbnb, aes(x = host_is_superhost, y = price)) + geom_boxplot(),
  ggplot(sea_airbnb, aes(x = neighbourhood_group, y = price)) +
    geom_boxplot() +
    theme(axis.text.x = element_text(angle = 15, size = 7)),
  ncol = 1
)
Comments
host_is_superhost vs price
neighbourhood_group vs price
# Price distribution by property type (top) and performance tier (bottom)
grid.arrange(
  ggplot(sea_airbnb, aes(x = property_type, y = price)) +
    geom_boxplot() +
    theme(axis.text.x = element_text(angle = 15, size = 5)),
  ggplot(sea_airbnb, aes(x = performance, y = price)) + geom_boxplot(),
  ncol = 1
)
Comments
property_type vs price
performance vs price
# Price distribution by cancellation policy (top) and instant-bookable flag
# (bottom)
grid.arrange(
  ggplot(sea_airbnb, aes(x = cancellation_policy, y = price)) +
    geom_boxplot() +
    theme(axis.text.x = element_text(angle = 10, size = 9)),
  ggplot(sea_airbnb, aes(x = instant_bookable, y = price)) + geom_boxplot(),
  ncol = 1
)
Comments
cancellation_policy vs price
instant_bookable vs price
Questions
Which cancellation policy is more welcomed?
# Reviews per month by cancellation policy (top) and property type (bottom)
grid.arrange(
  ggplot(sea_airbnb, aes(x = cancellation_policy, y = reviews_per_month)) +
    geom_boxplot() +
    theme(axis.text.x = element_text(angle = 10, size = 9)),
  ggplot(sea_airbnb, aes(x = property_type, y = reviews_per_month)) +
    geom_boxplot() +
    theme(axis.text.x = element_text(angle = 15, size = 5)),
  ncol = 1
)
Comments
cancellation_policy vs reviews_per_month
property_type vs reviews_per_month
Questions
How about the revenue across different cancellation policies?
How about the revenue across different property types?
# Reviews per month by instant-bookable flag (top) and neighbourhood group
# (bottom)
grid.arrange(
  ggplot(sea_airbnb, aes(x = instant_bookable, y = reviews_per_month)) +
    geom_boxplot(),
  ggplot(sea_airbnb, aes(x = neighbourhood_group, y = reviews_per_month)) +
    geom_boxplot() +
    theme(axis.text.x = element_text(angle = 15, size = 7)),
  ncol = 1
)
Comments
instant_bookable vs reviews_per_month
neighbourhood_group vs reviews_per_month
# Superhost status against reviews per month (top) and weighted score (bottom)
grid.arrange(
  ggplot(sea_airbnb, aes(x = host_is_superhost, y = reviews_per_month)) +
    geom_boxplot(),
  ggplot(sea_airbnb, aes(x = host_is_superhost, y = weighted_score)) +
    geom_boxplot(),
  ncol = 1
)
Comments
host_is_superhost vs reviews_per_month
host_is_superhost vs weighted_score
# Host response rate distribution within each performance tier
ggplot(sea_airbnb, aes(x = performance, y = host_response_rate)) +
  geom_boxplot()
Comments
host_response_rate vs performance
# mutate annual_revenue column
# Simplifying assumption: reviews per month stands in for the days booked each
# month, so monthly reviews * 12 approximates the yearly booking days.
sea_airbnb$annual_rev <- with(sea_airbnb, price * reviews_per_month * 12)
# Annual revenue by cancellation policy (top) and neighbourhood group (bottom)
grid.arrange(
  ggplot(sea_airbnb, aes(x = cancellation_policy, y = annual_rev)) +
    geom_boxplot() +
    theme(axis.text.x = element_text(angle = 10, size = 9)),
  ggplot(sea_airbnb, aes(x = neighbourhood_group, y = annual_rev)) +
    geom_boxplot() +
    theme(axis.text.x = element_text(angle = 12, size = 7)),
  ncol = 1
)
Comments
cancellation_policy vs annual_revenue
neighbourhood_group vs annual_revenue
# Annual revenue by superhost status (top) and instant-bookable flag (bottom)
grid.arrange(
  ggplot(sea_airbnb, aes(x = host_is_superhost, y = annual_rev)) +
    geom_boxplot(),
  ggplot(sea_airbnb, aes(x = instant_bookable, y = annual_rev)) +
    geom_boxplot(),
  ncol = 1
)
Comments
host_is_superhost vs annual_revenue
instant_bookable vs annual_revenue
# Annual revenue distribution per property type
ggplot(sea_airbnb, aes(x = property_type, y = annual_rev)) +
  geom_boxplot() +
  theme(axis.text.x = element_text(angle = 13, size = 5))
Comments
Apartment and Condominium have similar annual revenue distributions.
House has more chance to earn a high revenue.
Even though Farm stay is popular, its annual revenue is not especially high.
Tree house has the lowest annual revenue.
# annual_rev/host_is_superhost/performance
# Median annual revenue per performance tier, one line per superhost status.
sea_airbnb %>%
  group_by(host_is_superhost, performance) %>%
  summarise(med_revenue = median(annual_rev)) %>%
  ggplot(mapping = aes(
    x = performance, y = med_revenue,
    color = host_is_superhost, group = host_is_superhost
  )) +
  geom_line() +
  geom_point() +
  labs(x = "Performance", y = "Median Revenue", color = "Host is superhost") +
  theme_classic() +
  #scale_color_manual(values = c("#969696", "red"), labels = c("No", "Yes"), guide = guide_legend(reverse = TRUE)) +
  # `labels` is spelled out in full; `label` only worked through partial
  # argument matching.
  scale_y_continuous(labels = dollar)
Comments
For each performance level, superhost can earn more revenue than non-superhost.
# Median annual revenue per neighbourhood group, one line per instant-bookable
# status; coord_flip() puts the neighbourhood names on the vertical axis.
sea_airbnb %>%
  group_by(neighbourhood_group, instant_bookable) %>%
  summarise(med_revenue = median(annual_rev)) %>%
  # summarise() only drops the last grouping level; clear the rest explicitly.
  ungroup() %>%
  ggplot(mapping = aes(
    x = neighbourhood_group, y = med_revenue,
    color = instant_bookable, group = instant_bookable
  )) +
  # colour is inherited from the plot-level mapping
  geom_line() +
  geom_point() +
  coord_flip() +
  labs(x = "Neighbourhood Group", y = "Median Revenue", color = "Instant Bookable") +
  theme_classic() +
  #theme(axis.text.x = element_text(size = 5, angle = 13)) +
  #scale_color_manual(values = c("#969696", "red"), labels = c("No", "Yes"), guide = guide_legend(reverse = TRUE)) +
  # `labels` is spelled out instead of relying on partial matching of `label`.
  scale_y_continuous(labels = dollar)
Comments
# Median annual revenue per cancellation policy, one line per superhost
# status; coord_flip() puts the policy names on the vertical axis.
sea_airbnb %>%
  group_by(cancellation_policy, host_is_superhost) %>%
  summarise(med_revenue = median(annual_rev)) %>%
  # summarise() only drops the last grouping level; clear the rest explicitly.
  ungroup() %>%
  ggplot(mapping = aes(
    x = cancellation_policy, y = med_revenue,
    color = host_is_superhost, group = host_is_superhost
  )) +
  # colour is inherited from the plot-level mapping
  geom_line() +
  geom_point() +
  coord_flip() +
  labs(x = "Cancellation Policy", y = "Median Revenue", color = "Host is superhost") +
  theme_classic() +
  #scale_color_manual(values = c("#969696", "red"), labels = c("No", "Yes"), guide = guide_legend(reverse = TRUE)) +
  # `labels` is spelled out instead of relying on partial matching of `label`.
  scale_y_continuous(labels = dollar)
Comments
# Median price per property type, one line per superhost status;
# coord_flip() puts the property types on the vertical axis.
sea_airbnb %>%
  group_by(property_type, host_is_superhost) %>%
  summarise(med_price = median(price)) %>%
  # summarise() only drops the last grouping level; clear the rest explicitly.
  ungroup() %>%
  ggplot(mapping = aes(
    x = property_type, y = med_price,
    color = host_is_superhost, group = host_is_superhost
  )) +
  # colour is inherited from the plot-level mapping
  geom_line() +
  geom_point() +
  coord_flip() +
  labs(x = "Property type", y = "Median Price", color = "Host is superhost") +
  theme_classic() +
  #scale_color_manual(values = c("#969696", "red"), labels = c("No", "Yes"), guide = guide_legend(reverse = TRUE)) +
  # `labels` is spelled out instead of relying on partial matching of `label`.
  scale_y_continuous(labels = dollar)
Comments
# Median annual revenue per neighbourhood group, one line per superhost
# status; coord_flip() puts the neighbourhood names on the vertical axis.
sea_airbnb %>%
  group_by(neighbourhood_group, host_is_superhost) %>%
  summarise(med_revenue = median(annual_rev)) %>%
  # summarise() only drops the last grouping level; clear the rest explicitly.
  ungroup() %>%
  ggplot(mapping = aes(
    x = neighbourhood_group, y = med_revenue,
    color = host_is_superhost, group = host_is_superhost
  )) +
  # colour is inherited from the plot-level mapping
  geom_line() +
  geom_point() +
  coord_flip() +
  labs(x = "Neighbourhood Group", y = "Median Revenue", color = "Host is superhost") +
  theme_classic() +
  #scale_color_manual(values = c("#969696", "red"), labels = c("No", "Yes"), guide = guide_legend(reverse = TRUE)) +
  # `labels` is spelled out instead of relying on partial matching of `label`.
  scale_y_continuous(labels = dollar)
Comments
# Select the three property types with the most listings.
top_3_property_type <- sea_airbnb %>%
  group_by(property_type) %>%
  summarise(count_n = n()) %>%
  arrange(desc(count_n)) %>%
  slice(1:3)

# Keep only listings of those three types; the inner join also attaches the
# per-type listing count (count_n) to every retained row.
sea_airbnb_prpt_t3 <- inner_join(sea_airbnb, top_3_property_type, by = "property_type")

# Median price per neighbourhood group, one line per property type.
sea_airbnb_prpt_t3 %>%
  group_by(neighbourhood_group, property_type) %>%
  summarise(med_price = median(price)) %>%
  # summarise() only drops the last grouping level; clear the rest explicitly.
  ungroup() %>%
  ggplot(mapping = aes(
    x = neighbourhood_group, y = med_price,
    color = property_type, group = property_type
  )) +
  # colour is inherited from the plot-level mapping
  geom_line() +
  geom_point() +
  coord_flip() +
  labs(x = "Neighbourhood Group", y = "Median Price", color = "Property Type") +
  theme_classic() +
  # `labels` is spelled out instead of relying on partial matching of `label`.
  scale_y_continuous(labels = dollar)
Comments
# Median price for every distinct (instant_bookable, reviews_per_month) pair.
price_by_reviews <- sea_airbnb %>%
  group_by(instant_bookable, reviews_per_month) %>%
  summarise(med_price = median(price))

# Scatter of median price against reviews per month, one facet per
# instant_bookable value; grid lines and panel background are removed.
ggplot(
  data = price_by_reviews,
  mapping = aes(x = reviews_per_month, y = med_price, color = instant_bookable)
) +
  geom_point() +
  facet_grid(. ~ instant_bookable) +
  theme_bw() +
  theme(
    axis.text.x = element_text(face = "bold", size = 10, angle = 0),
    legend.position = "bottom",
    plot.title = element_text(hjust = 0.5, face = "bold"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank()
  ) +
  labs(x = "Reviews Per Month", y = "Median Price", color = "Instant Bookable") +
  scale_y_continuous(labels = dollar)
Comments
Instant-bookable listings include more listings with a high number of monthly reviews.
Instant-bookable listings also have more high-price data points.
# Median price for every performance level, one line per instant-bookable
# status.
sea_airbnb %>%
  group_by(performance, instant_bookable) %>%
  summarise(med_price = median(price)) %>%
  # summarise() only drops the last grouping level; clear the rest explicitly.
  ungroup() %>%
  ggplot(mapping = aes(
    x = performance, y = med_price,
    color = instant_bookable, group = instant_bookable
  )) +
  # colour is inherited from the plot-level mapping
  geom_line() +
  geom_point() +
  labs(x = "Performance", y = "Median Price", color = "Instant Bookable") +
  theme_classic() +
  # `labels` is spelled out instead of relying on partial matching of `label`.
  scale_y_continuous(labels = dollar)
Comments
The median prices of Fair, Good and Excellent performance are all lower than $140.
Instant-bookable listings with bad performance have the highest median price.
Two main findings
Establish a hypothesis
Finding 1: Being superhost can increase revenue.
Finding 2: Instant bookable can increase revenue.
Null hypothesis: Instant bookable has no effect on revenue.
Alternate hypothesis: Instant bookable has an effect on revenue.
Set confidence interval
Superhost and Revenue
# Box plot of annual revenue for each superhost status.
ggplot(data = sea_airbnb, mapping = aes(x = host_is_superhost, y = annual_rev)) +
  geom_boxplot()
According to the boxplot, host_is_superhost appears to have some effect on annual revenue.
# Two-sample t-test of annual_rev for superhosts ("Yes", first sample) against
# non-superhosts ("No", second sample). var.equal is not set, so t.test() runs
# the Welch (unequal-variance) version; conf.level = 0.95 sets the level of the
# reported confidence interval for the difference in means.
t.test(sea_airbnb$annual_rev[sea_airbnb$host_is_superhost == "Yes"], sea_airbnb$annual_rev[sea_airbnb$host_is_superhost == "No"], conf.level = 0.95)
##
## Welch Two Sample t-test
##
## data: sea_airbnb$annual_rev[sea_airbnb$host_is_superhost == "Yes"] and sea_airbnb$annual_rev[sea_airbnb$host_is_superhost == "No"]
## t = 12.802, df = 6280.9, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 1199.990 1633.962
## sample estimates:
## mean of x mean of y
## 5054.503 3637.527
My null hypothesis is that being a superhost has no effect on revenue. According to the t-test, the p-value is close to zero at the 0.95 confidence level, which indicates that we can reject the null hypothesis that being a superhost has no effect on revenue.
# Median revenue and host_is_superhost
# 95% two-sided CI: 2.5% is left in each tail, so the critical value is the
# 97.5th percentile of the standard normal (about 1.96). qnorm(0.95) would
# return the one-sided value (about 1.645) and give intervals that are too narrow.
z <- qnorm(0.975)
# Per superhost status: median annual revenue, standard deviation, group size
# and the half-width ci = z * sd / sqrt(n) used for the error bars.
superhost_rev_summary <- sea_airbnb %>%
  group_by(host_is_superhost) %>%
  summarise(
    med_revenue = median(annual_rev),
    sd = sd(annual_rev),
    n = n(),
    ci = z * sd / sqrt(n)
  )

# Bars show the median revenue; error bars span med_revenue -/+ ci.
ggplot(superhost_rev_summary, aes(x = host_is_superhost, y = med_revenue)) +
  geom_col(position = "dodge") +
  geom_errorbar(
    aes(ymin = med_revenue - ci, ymax = med_revenue + ci),
    width = 0.5,
    position = position_dodge(0.9)
  )
According to the error bars, the spread of the data is small relative to our confidence interval. Specifically, the lower end of the host_is_superhost “Yes” error bar does not overlap with the host_is_superhost “No” error bar, which means the “Yes” group differs from the “No” group. So we can reject our null hypothesis that being a superhost has no effect on median revenue.
# Annual revenue by instant-bookable status, with one box per neighbourhood
# group (fill colour) inside each status.
ggplot(
  data = sea_airbnb,
  mapping = aes(x = instant_bookable, y = annual_rev, fill = neighbourhood_group)
) +
  geom_boxplot() +
  labs(x = "Instant bookable", y = "Annual Revenue", fill = "Neighbourhood Group")
According to the boxplot, within the same neighbourhood group, different booking policies can bring in different revenues. This may imply that the combination of instant_bookable and neighbourhood_group has an effect on revenue.
# Count listings in every instant_bookable x neighbourhood_group cell; these
# counts feed both the confidence intervals and the proportions below.
A_F_n <- sea_airbnb %>%
  count(instant_bookable, neighbourhood_group)

# Confidence intervals for the cell proportions from multinomialCI()
# (alpha = 0.05). The count column is referenced by name rather than by
# position. Column 1 holds the lower bounds, column 2 the upper bounds.
A_F_n_ci <- multinomialCI(A_F_n$n, 0.05)

# Table of proportions (cell count / total number of listings) with the CI
# bounds attached. Everything is derived from the same counts, so the rows of
# the table and the rows of the interval matrix are guaranteed to line up.
A_F_tab <- A_F_n %>%
  transmute(
    instant_bookable,
    neighbourhood_group,
    prop = round(n / sum(n), 3),
    ci_l = round(A_F_n_ci[, 1], 3),
    ci_u = round(A_F_n_ci[, 2], 3)
  )

# Show the table
formattable(A_F_tab)
| instant_bookable | neighbourhood_group | prop | ci_l | ci_u |
|---|---|---|---|---|
| No | Ballard | 0.027 | 0.018 | 0.037 |
| No | Beacon Hill | 0.017 | 0.008 | 0.026 |
| No | Capitol Hill | 0.046 | 0.037 | 0.056 |
| No | Cascade | 0.011 | 0.001 | 0.020 |
| No | Central Area | 0.043 | 0.033 | 0.052 |
| No | Delridge | 0.017 | 0.008 | 0.027 |
| No | Downtown | 0.043 | 0.034 | 0.052 |
| No | Interbay | 0.001 | 0.000 | 0.010 |
| No | Lake City | 0.011 | 0.002 | 0.020 |
| No | Magnolia | 0.010 | 0.001 | 0.019 |
| No | Northgate | 0.013 | 0.004 | 0.022 |
| No | Other neighborhoods | 0.106 | 0.096 | 0.115 |
| No | Queen Anne | 0.032 | 0.023 | 0.041 |
| No | Rainier Valley | 0.028 | 0.019 | 0.038 |
| No | Seward Park | 0.007 | 0.000 | 0.017 |
| No | University District | 0.015 | 0.006 | 0.025 |
| No | West Seattle | 0.030 | 0.020 | 0.039 |
| Yes | Ballard | 0.026 | 0.016 | 0.035 |
| Yes | Beacon Hill | 0.020 | 0.011 | 0.030 |
| Yes | Capitol Hill | 0.055 | 0.046 | 0.065 |
| Yes | Cascade | 0.026 | 0.017 | 0.035 |
| Yes | Central Area | 0.049 | 0.039 | 0.058 |
| Yes | Delridge | 0.015 | 0.006 | 0.025 |
| Yes | Downtown | 0.130 | 0.121 | 0.140 |
| Yes | Interbay | 0.001 | 0.000 | 0.011 |
| Yes | Lake City | 0.008 | 0.000 | 0.017 |
| Yes | Magnolia | 0.008 | 0.000 | 0.018 |
| Yes | Northgate | 0.012 | 0.003 | 0.022 |
| Yes | Other neighborhoods | 0.084 | 0.075 | 0.093 |
| Yes | Queen Anne | 0.039 | 0.030 | 0.049 |
| Yes | Rainier Valley | 0.026 | 0.017 | 0.036 |
| Yes | Seward Park | 0.004 | 0.000 | 0.014 |
| Yes | University District | 0.011 | 0.002 | 0.021 |
| Yes | West Seattle | 0.028 | 0.019 | 0.037 |
# Dodged bars of the listing proportion per instant-bookable status and
# neighbourhood group, labelled with the rounded proportion and overlaid with
# the ci_l / ci_u confidence bounds as error bars.
bar_dodge <- position_dodge(0.9)
ggplot(A_F_tab, aes(x = instant_bookable, y = prop, fill = neighbourhood_group)) +
  geom_col(position = "dodge") +
  geom_text(
    aes(label = round(prop, 2)),
    vjust = -1.5, color = "black", position = bar_dodge, size = 4
  ) +
  geom_errorbar(
    aes(ymin = ci_l, ymax = ci_u),
    width = 0.4, position = bar_dodge
  ) +
  labs(x = "Instant Bookable", y = "Proportion", fill = "Neighbourhood group")
Based on this sample:
# Linear regression of annual_rev on superhost status, instant-bookable status
# and neighbourhood group, fitted with glm(). No `family` is supplied, so
# glm() falls back to its default gaussian family (the summary reports a
# "gaussian family" dispersion parameter); this is not a logit model.
mod <- glm(annual_rev ~ host_is_superhost + instant_bookable + neighbourhood_group, data = sea_airbnb)
# Print coefficients, standard errors, t statistics and p-values.
summary(mod)
##
## Call:
## glm(formula = annual_rev ~ host_is_superhost + instant_bookable +
## neighbourhood_group, data = sea_airbnb)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7234 -2678 -984 1467 58562
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 2795.4 248.3 11.258
## host_is_superhostYes 1644.3 109.6 14.998
## instant_bookableYes 1501.4 109.9 13.656
## neighbourhood_groupBeacon Hill 154.0 359.9 0.428
## neighbourhood_groupCapitol Hill 953.8 285.8 3.338
## neighbourhood_groupCascade -411.8 362.6 -1.136
## neighbourhood_groupCentral Area 526.9 291.0 1.811
## neighbourhood_groupDelridge -1008.7 374.4 -2.694
## neighbourhood_groupDowntown 557.7 267.9 2.081
## neighbourhood_groupInterbay -362.3 1241.2 -0.292
## neighbourhood_groupLake City -1841.9 455.7 -4.042
## neighbourhood_groupMagnolia -237.1 458.7 -0.517
## neighbourhood_groupNorthgate -1637.3 407.3 -4.019
## neighbourhood_groupOther neighborhoods -851.3 261.8 -3.252
## neighbourhood_groupQueen Anne 1422.5 305.6 4.655
## neighbourhood_groupRainier Valley -1045.6 324.8 -3.219
## neighbourhood_groupSeward Park -1762.8 546.0 -3.228
## neighbourhood_groupUniversity District -1993.5 401.5 -4.965
## neighbourhood_groupWest Seattle -370.8 320.7 -1.156
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## host_is_superhostYes < 0.0000000000000002 ***
## instant_bookableYes < 0.0000000000000002 ***
## neighbourhood_groupBeacon Hill 0.66876
## neighbourhood_groupCapitol Hill 0.00085 ***
## neighbourhood_groupCascade 0.25615
## neighbourhood_groupCentral Area 0.07026 .
## neighbourhood_groupDelridge 0.00708 **
## neighbourhood_groupDowntown 0.03744 *
## neighbourhood_groupInterbay 0.77040
## neighbourhood_groupLake City 0.000053683 ***
## neighbourhood_groupMagnolia 0.60525
## neighbourhood_groupNorthgate 0.000059004 ***
## neighbourhood_groupOther neighborhoods 0.00115 **
## neighbourhood_groupQueen Anne 0.000003314 ***
## neighbourhood_groupRainier Valley 0.00129 **
## neighbourhood_groupSeward Park 0.00125 **
## neighbourhood_groupUniversity District 0.000000704 ***
## neighbourhood_groupWest Seattle 0.24765
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 17842552)
##
## Null deviance: 124977025806 on 6300 degrees of freedom
## Residual deviance: 112086913946 on 6282 degrees of freedom
## AIC: 123111
##
## Number of Fisher Scoring iterations: 2
# Plot the model residuals against several variables to check for patterns.
# One plot per device page.
par(mfrow = c(1, 1))
# Residuals against the response itself.
# NOTE(review): residuals are correlated with the response by construction;
# residuals vs. fitted values is the more usual diagnostic -- confirm intent.
plot(sea_airbnb$annual_rev, mod$residuals)
# Repeats the single-panel layout set above; no change in effect.
par(mfrow = c(1, 1))
# Residuals against price, which is not a predictor in the model formula.
plot(sea_airbnb$price, mod$residuals)
# Residuals against each predictor. Presumably these columns are factors, in
# which case plot() draws one box per level -- verify against the data prep.
plot(sea_airbnb$host_is_superhost, mod$residuals)
plot(sea_airbnb$instant_bookable, mod$residuals)
plot(sea_airbnb$neighbourhood_group, mod$residuals)
# Coefficient matrix of the fitted model (estimate, std. error, t, p-value).
coe <- coef(summary(mod))

# Combine coefficients with their confidence bounds, dropping the intercept
# row from both, and give the columns short names for the table and the plot.
coe_CI <- setNames(
  as.data.frame(cbind(coe[-1, ], confint(mod)[-1, ])),
  c("estimate", "se", "t", "pval", "low_CI", "high_CI")
)

# Render the table sorted by ascending p-value, rounded to one decimal place.
rows_by_pval <- order(coe_CI$pval)
htmlTable(round(coe_CI[rows_by_pval, ], 1))
| estimate | se | t | pval | low_CI | high_CI | |
|---|---|---|---|---|---|---|
| host_is_superhostYes | 1644.3 | 109.6 | 15 | 0 | 1429.4 | 1859.1 |
| instant_bookableYes | 1501.4 | 109.9 | 13.7 | 0 | 1285.9 | 1716.8 |
| neighbourhood_groupUniversity District | -1993.5 | 401.5 | -5 | 0 | -2780.3 | -1206.6 |
| neighbourhood_groupQueen Anne | 1422.5 | 305.6 | 4.7 | 0 | 823.5 | 2021.5 |
| neighbourhood_groupLake City | -1841.9 | 455.7 | -4 | 0 | -2735.1 | -948.7 |
| neighbourhood_groupNorthgate | -1637.3 | 407.3 | -4 | 0 | -2435.6 | -838.9 |
| neighbourhood_groupCapitol Hill | 953.8 | 285.8 | 3.3 | 0 | 393.7 | 1513.9 |
| neighbourhood_groupOther neighborhoods | -851.3 | 261.8 | -3.3 | 0 | -1364.4 | -338.2 |
| neighbourhood_groupSeward Park | -1762.8 | 546 | -3.2 | 0 | -2832.9 | -692.6 |
| neighbourhood_groupRainier Valley | -1045.6 | 324.8 | -3.2 | 0 | -1682.2 | -409 |
| neighbourhood_groupDelridge | -1008.7 | 374.4 | -2.7 | 0 | -1742.6 | -274.9 |
| neighbourhood_groupDowntown | 557.7 | 267.9 | 2.1 | 0 | 32.5 | 1082.8 |
| neighbourhood_groupCentral Area | 526.9 | 291 | 1.8 | 0.1 | -43.5 | 1097.3 |
| neighbourhood_groupWest Seattle | -370.8 | 320.7 | -1.2 | 0.2 | -999.5 | 257.8 |
| neighbourhood_groupCascade | -411.8 | 362.6 | -1.1 | 0.3 | -1122.4 | 298.9 |
| neighbourhood_groupMagnolia | -237.1 | 458.7 | -0.5 | 0.6 | -1136.2 | 662 |
| neighbourhood_groupBeacon Hill | 154 | 359.9 | 0.4 | 0.7 | -551.5 | 859.5 |
| neighbourhood_groupInterbay | -362.3 | 1241.2 | -0.3 | 0.8 | -2795 | 2070.5 |
# Coefficient plot: one point per model term with a horizontal segment for its
# confidence interval and a red reference line at zero.
# The y-axis factor is built once as a real column, so that aes() never has to
# reach outside the plot data with `$`. Terms are ordered by descending
# p-value, which puts the smallest p-values at the top.
coe_plot_data <- coe_CI
coe_plot_data$term <- reorder(row.names(coe_CI), desc(coe_CI$pval))

g1 <- ggplot(coe_plot_data, aes(x = estimate, y = term)) +
  geom_point(size = 3) +
  xlim(min(coe_CI$low_CI), max(coe_CI$high_CI)) +
  ylab("Variable") +
  xlab("Coefficient") +
  theme_bw()

# Both interval bounds are mapped inside aes(), so a single segment per term
# runs from low_CI to high_CI and stays aligned with the rows of the data.
g2 <- g1 +
  geom_segment(
    aes(x = low_CI, xend = high_CI, yend = term),
    color = "Blue"
  ) +
  xlab("Coefficient with Confidence Interval")

# Intervals crossing the red line include zero.
g3 <- g2 +
  geom_vline(xintercept = 0, color = "red")
g3
Comments
The coefficients of neighbourhood_groupWest Seattle, neighbourhood_groupCascade, neighbourhood_groupMagnolia, neighbourhood_groupBeacon Hill and neighbourhood_groupInterbay are not statistically distinguishable from zero, which indicates that these groups have little measurable impact on revenue.
In summary, I would conclude that being a superhost, the neighbourhood group, and being instant bookable all have an effect on revenue.
- Revenue is higher for superhost.
- Revenue is higher for instant bookable Airbnb.
- Revenue is higher in some specific neighbourhood groups.
# Presentation plot: median annual revenue per performance level, one line per
# superhost status, with a dashed horizontal reference line at 6050.
rev_suphost_perform <- sea_airbnb %>%
  group_by(host_is_superhost, performance) %>%
  summarise(med_revenue = median(annual_rev)) %>%
  # summarise() only drops the last grouping level; clear the rest explicitly.
  ungroup() %>%
  ggplot(mapping = aes(
    x = performance, y = med_revenue,
    color = host_is_superhost, group = host_is_superhost
  )) +
  # colour is inherited from the plot-level mapping
  geom_line() +
  geom_point() +
  geom_hline(yintercept = 6050, linetype = 2, color = "black") +
  ggtitle("Being Superhost Can Earn More Revenue Per Year", subtitle = "Median revenue of superhost is higher than non-superhost") +
  labs(x = "Performance", y = "Median Revenue", color = "Host is superhost") +
  theme_classic() +
  theme(
    plot.title = element_text(face = "bold"),
    axis.ticks.x = element_blank(),
    axis.text.x = element_text(face = "bold"),
    legend.position = c(0.85, 0.17)
  ) +
  # Grey for the first level, red for the second; legend order is reversed.
  scale_color_manual(values = c("#969696", "red"), labels = c("No", "Yes"), guide = guide_legend(reverse = TRUE)) +
  # `labels` is spelled out instead of relying on partial matching of `label`.
  scale_y_continuous(labels = dollar)
# show the graph
rev_suphost_perform
# Presentation plot: median annual revenue per neighbourhood group, one line
# per instant-bookable status. Two dashed lines at 4000 and 6200 bracket the
# band annotated as "High revenue range".
rev_neighbour_book <- sea_airbnb %>%
  group_by(neighbourhood_group, instant_bookable) %>%
  summarise(med_revenue = median(annual_rev)) %>%
  # summarise() only drops the last grouping level; clear the rest explicitly.
  ungroup() %>%
  ggplot(mapping = aes(
    x = neighbourhood_group, y = med_revenue,
    color = instant_bookable, group = instant_bookable
  )) +
  # colour is inherited from the plot-level mapping
  geom_line() +
  geom_point() +
  ggtitle("Making Your Home Instant Bookable", subtitle = "Revenue of Instant Bookable Airbnb home is higher") +
  coord_flip() +
  labs(x = "Neighbourhood Group", y = "Median Revenue", color = "Instant Bookable") +
  # x is a position on the discrete neighbourhood axis, y a revenue value.
  annotate("text", x = 16.5, y = 5150, label = "High revenue range", color = "black", size = 3.5) +
  geom_hline(yintercept = 6200, linetype = 2, color = "black", alpha = 0.5) +
  geom_hline(yintercept = 4000, linetype = 2, color = "black", alpha = 0.5) +
  theme_classic() +
  theme(
    plot.title = element_text(face = "bold"),
    axis.ticks.x = element_blank(),
    axis.line.x = element_blank(),
    axis.ticks.y = element_blank(),
    axis.line.y = element_blank(),
    axis.text.x = element_text(face = "bold"),
    legend.position = "bottom"
  ) +
  # Grey for the first level, red for the second; legend order is reversed.
  scale_color_manual(values = c("#969696", "red"), labels = c("No", "Yes"), guide = guide_legend(reverse = TRUE)) +
  # `labels` is spelled out instead of relying on partial matching of `label`.
  scale_y_continuous(labels = dollar)
# show the graph
rev_neighbour_book
# Flag listings with at least 10 reviews per month as "high", all others as
# "low". "low" is the first factor level so it receives the first (grey)
# colour below. Rows with a missing reviews_per_month stay NA.
sea_airbnb$high_monthly_reviews <- factor(
  ifelse(sea_airbnb$reviews_per_month >= 10, "high", "low"),
  levels = c("low", "high")
)

# Presentation plot: price against reviews per month, highlighting the
# high-review listings in red, with a dashed reference line at a price of 250.
review_price <- sea_airbnb %>%
  ggplot(mapping = aes(x = reviews_per_month, y = price, color = high_monthly_reviews)) +
  geom_point(alpha = 0.8) +
  ggtitle("What Price Range Is More Welcomed??", subtitle = "Home price lower than $250") +
  labs(x = "Reviews Per Month", y = "Price") +
  geom_hline(yintercept = 250, linetype = 2, color = "black") +
  #annotate("text", x = 16.5, y = 5150, label = "High revenue range", color = "black", size = 3.5)
  scale_color_manual(values = c("#999999", "red")) +
  theme_classic() +
  theme(
    plot.title = element_text(face = "bold"),
    axis.ticks.x = element_blank(),
    axis.line.x = element_blank(),
    axis.ticks.y = element_blank(),
    axis.line.y = element_blank(),
    legend.position = "none"
  ) +
  # `labels` is spelled out instead of relying on partial matching of `label`.
  scale_y_continuous(labels = dollar)
# show the graph
review_price
# Presentation plot: median price per neighbourhood group drawn as one line.
# The data already holds one median per group, so a plain geom_line() draws
# it directly; the earlier stat_summary(fun.y = median) re-aggregated single
# values and relied on the deprecated `fun.y` argument.
price_neighbour <- sea_airbnb %>%
  group_by(neighbourhood_group) %>%
  summarise(med_price = median(price)) %>%
  ggplot(mapping = aes(x = neighbourhood_group, y = med_price)) +
  # group = 1 joins all discrete x positions into a single line
  geom_line(aes(group = 1), lwd = 0.6) +
  coord_flip() +
  ggtitle("Which Neighbourhood Groups Have High Home Price??", subtitle = "Downtown, Cascade and Queen Anne have highest median price") +
  labs(x = "Neighbourhood Group", y = "Median Price") +
  # Red dashed markers at positions 13, 7 and 4 of the discrete axis.
  # NOTE(review): presumably these are Queen Anne, Downtown and Cascade (as
  # named in the subtitle); the indices depend on the factor level order.
  geom_vline(xintercept = 13, linetype = 2, color = "red", alpha = 0.75) +
  geom_vline(xintercept = 7, linetype = 2, color = "red", alpha = 0.75) +
  geom_vline(xintercept = 4, linetype = 2, color = "red", alpha = 0.75) +
  theme_classic() +
  theme(
    plot.title = element_text(face = "bold"),
    axis.ticks.x = element_blank(),
    axis.line.x = element_blank(),
    axis.ticks.y = element_blank(),
    axis.line.y = element_blank(),
    legend.position = "none"
  ) +
  # `labels` is spelled out instead of relying on partial matching of `label`.
  scale_y_continuous(labels = dollar)
# show the graph
price_neighbour
# Write each presentation plot to "<object name>.png" in the working
# directory. No width/height is given, so ggsave() uses its default size and
# reports it once per file.
final_plots <- list(
  rev_suphost_perform = rev_suphost_perform,
  rev_neighbour_book = rev_neighbour_book,
  review_price = review_price,
  price_neighbour = price_neighbour
)
for (plot_name in names(final_plots)) {
  ggsave(filename = paste0(plot_name, ".png"), plot = final_plots[[plot_name]])
}
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image